library(tximport) 
library(DESeq2)
library(pheatmap)
library(AnnotationDbi)
library(EnhancedVolcano)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(RColorBrewer)
library(EnsDb.Hsapiens.v86)
library(msigdbr)
library(clusterProfiler)
library(ggpubr)
library(ggVennDiagram)
library(VennDiagram)
library(enrichR)
library(enrichplot)
library(DT)
dbs <- c("KEGG_2019_Human", "BioPlanet_2019")
enrichplotfxn <- function(x){
  x %>%
    top_n(10, Combined.Score) %>% 
    arrange(desc(Combined.Score)) %>%
    mutate(Term = factor(Term, levels = rev(Term))) %>%
    ggplot(aes(x = Term, y = Combined.Score, fill = -log10(Adjusted.P.value))) +
    geom_bar(stat = "identity") +
    theme_bw(base_size = 12) +
    xlab(NULL) +
    scale_x_discrete(labels = function(x) str_wrap(x, width=30)) +
    labs(fill = "-Log10 \n Adjusted \n P Value") +
    ylab("Combined Score") +
    theme(plot.title = element_text(size = 12, hjust = 1), legend.title =element_text(size=9)) +
    coord_flip()
}
gene_sets <- msigdbr(species = "Homo sapiens", category = "C5")
gene_sets <- gene_sets %>%
  dplyr::select(gs_name, gene_symbol)

Introduction

The following analysis is a Differential Gene Expression Analysis exploring the RNA-seq data of Mesenchymal Stem Cells (MSCs), and Ewing Sarcoma Cells (TC32) that were both treated with the spliceosome-inhibiting drug E7101. This analysis was completed for the R-loops and Splicing project.

Questions to Answer:

  1. What are the genes which are overexpressed and underexpressed with E7107 treatment in MSCs and TC32? What pathways do they relate to?
  2. What are the DEGs with E7107 treatment in both cell lines? What pathways do they relate to?
  3. What are the DEGs which are exclusive to only TC32 cells? What pathways do they relate to?

Effects of E7107

Question #1: What are the genes which are overexpressed and underexpressed with E7107 treatment in MSCs and TC32? What pathways do they relate to?

Data Import and Inspection

The data was imported from Salmon quantification files using tximport. A variance-stabilizing transformation was applied to the count data and a principal component analysis (PCA) plot was generated. Upon inspection of the PCA, it was determined that one of the MSC samples had an unacceptable level of divergence and therefore should be excluded. The non-treated MSC sample was censored from the rest of the analysis.

#Transcript to Gene Annotation
edb <- EnsDb.Hsapiens.v86
k <- keys(edb, keytype = "TXNAME")
tx2gene <- AnnotationDbi::select(x = edb, keys = k, columns = "SYMBOL", 
                                 keytype ="TXNAME") 
# "_uc" = "Uncensored"

#Create a data frame with sample names, and condition
sampleTable_uc <- data.frame(
  sample_id = c("MSC.1", "MSC.2", "MSC.3", "MSCE7.1", "MSCE7.2", "MSCE7.3","TC32.1", 
                "TC32.2", "TC32.3", "TC32E7.1", "TC32E7.2", "TC32E7.3"),
  condition = factor(rep(c("NT", "E7107"), times = 2, each = 3)),
  cell = c(rep("MSC",6), rep("TC32", 6)))

#Create named list pointing to quantification file locations
dir <-  "data/rnaseq_TC32_MSC_E7107/salmon_out/"
files_uc <- file.path(dir, sampleTable_uc$sample_id, "quant.sf")
names(files_uc) <- sampleTable_uc$sample_id

#Import data from Salmon quant files
txi.salmon_uc <- tximport(files_uc, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)

#DESeq Dataset:
dds_uc <- DESeqDataSetFromTximport(txi = txi.salmon_uc, colData = sampleTable_uc, design = ~condition)

#QC Visualization:
#Variance Stabilization
vsd_uc <- vst(dds_uc, blind = FALSE)

#PCA plot
plotPCA(vsd_uc, intgroup = c("condition", "cell")) +
  ggtitle("PCA of Condition")+
  coord_fixed(ratio = 3)#Appears to be an outlier

#Only MSC Line: 
MSC_idx_uc <- grep("MSC", names(files_uc))
MSC_files_uc <- files_uc[MSC_idx_uc]

MSC_txi.salmon_uc <- tximport(MSC_files_uc, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
#Sample table and DESeq Dataset:
MSC_sampleTable_uc <- sampleTable_uc[grep("MSC", sampleTable_uc$sample_id),]%>% select("condition")

MSC_dds_uc <- DESeqDataSetFromTximport(txi = MSC_txi.salmon_uc, colData = MSC_sampleTable_uc, 
                                    design = ~condition)

#Variance Stabilization, PCA Plot, and Heatmap
MSC_vsd_uc <- vst(MSC_dds_uc, blind = FALSE)
plotPCA(MSC_vsd_uc) + ggtitle("MSC PCA of Condition")  #Need to remove MSC.2 going forward

The subsequent PCA shows a distinct clustering of the cell lines. This clustering is echoed in the hierarchical heatmap showing that the cell lines have high correlation between samples. The distinct grouping between the two cell lines denote high quality data and increase the reliability of biologically relevant information that is detected.

#Import Data:

#Transcript to Gene Annotation
edb <- EnsDb.Hsapiens.v86
k <- keys(edb, keytype = "TXNAME")
tx2gene <- AnnotationDbi::select(x = edb, keys = k, columns = "SYMBOL", 
                                 keytype ="TXNAME") 

#Create a data frame with sample names, and condition
sampleTable <- data.frame(
  sample_id = c("MSC.1", "MSC.2", "MSC.3", "MSCE7.1", "MSCE7.2", "MSCE7.3","TC32.1", 
                "TC32.2", "TC32.3", "TC32E7.1", "TC32E7.2", "TC32E7.3"),
  condition = factor(rep(c("NT", "E7107"), times = 2, each = 3)),
  cell = c(rep("MSC",6), rep("TC32", 6)))
sampleTable <- sampleTable[which((sampleTable$sample_id) != "MSC.2"),]  #Censor MSC2

#Create named list pointing to quantification file locations
dir <-  "data/rnaseq_TC32_MSC_E7107/salmon_out/"
files <- file.path(dir, sampleTable$sample_id, "quant.sf")
names(files) <- sampleTable$sample_id

#Import data from Salmon quant files
txi.salmon <- tximport(files, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)

#DESeq Dataset:
dds <- DESeqDataSetFromTximport(txi = txi.salmon, colData = sampleTable, design = ~condition)

#QC Visualization:

#Variance Stabilization
vsd <- vst(dds, blind = FALSE)

#PCA plot
PCA_plot <- plotPCA(vsd, intgroup = c("condition", "cell")) + coord_fixed(4)
PCA_plot +
  ggtitle("PCA of Condition")+
  coord_fixed(ratio = 7)

#Hierarchical Heatmap using correlation values between samples
vsd_matrix <- assay(vsd)
vsd_corr <- cor(vsd_matrix)
pheatmap(vsd_corr,  main = "Hierarchical Heatmap")

Effect of E7017 on TC32-specific DEGs

Only the differentially expressed genes that were exclusive to the TC32 cells were analyzed to assess what biological differences are specifically being detected in the Ewing Sarcoma cells. Again, the analysis was completed using the same parameters as above.

#TC32-Specific Gene Set - These are significantly differentially Expressed
'%!in%' <- Negate('%in%')
TC32specific_over_genes <- TC32_overexpressed[TC32_overexpressed$Gene %!in% MSC_overexpressed$Gene,] %>%
  write_csv(file = "analysis/DGE/csv_files/TC32-specific_OverexpressedGenes.csv")

TC32specific_under_genes <- TC32_underexpressed[TC32_underexpressed$Gene %!in% MSC_underexpressed$Gene,]%>%
  write_csv(file = "analysis/DGE/csv_files/TC32-specific_UnderexpressedGenes.csv")

#Set up EnrichR Permalinks
Enrich_Lst_TC32only <- list( "TC32only_Over" = TC32specific_over_genes$Gene, 
                         "TC32only_Under" = TC32specific_under_genes$Gene)
enrichLinks_TC32only <- lapply(names(Enrich_Lst_TC32only), function(group) {
  genes<- Enrich_Lst_TC32only[[group]]
  response <- httr::POST(url = 'https://maayanlab.cloud/Enrichr/addList', body = list(
    'list' = paste0(genes, collapse = "\n"),
    'description' = group
  ))
  jsonlite::fromJSON(httr::content(response, as = "text"))
})
names(enrichLinks_TC32only) <- names(Enrich_Lst_TC32only)

permalinks_TC32only <- lapply(names(enrichLinks_TC32only), function(x){
  paste0("https://maayanlab.cloud/Enrichr/enrich?dataset=", enrichLinks_TC32only[[x]]$shortId)
  })
names(permalinks_TC32only) <- names(Enrich_Lst_TC32only)
TC32-specific Enrichr Results:

For a broader representation of the data, both the KEGG and BioPlanet database results from the over-representation analysis.

TC32-specific Overexpressed:

#EnrichR Enrichment Analysis:
enrichr_Lst_TC32only <- lapply(Enrich_Lst_TC32only, function(x){enrichr(x, dbs)})
## Uploading data to Enrichr... Done.
##   Querying KEGG_2019_Human... Done.
##   Querying BioPlanet_2019... Done.
## Parsing results... Done.
## Uploading data to Enrichr... Done.
##   Querying KEGG_2019_Human... Done.
##   Querying BioPlanet_2019... Done.
## Parsing results... Done.
#Plot Results:
enrichplotfxn(enrichr_Lst_TC32only$TC32only_Over$KEGG_2019_Human) + 
  labs(title = "Top KEGG Pathways Enriched in Significantly Over-Expressed Genes Specific to TC32")
enrichplotfxn(enrichr_Lst_TC32only$TC32only_Over$BioPlanet_2019) + 
  labs(title = "Top BioPlanet Pathways Enriched in Significantly Over-Expressed Genes Specific to TC32")

TC32-specific Underexpressed:

#Plot Results:
enrichplotfxn(enrichr_Lst_TC32only$TC32only_Under$KEGG_2019_Human) + 
  labs(title = "Top KEGG Pathways Enriched in Significantly Underexpressed Genes Specific to TC32")
enrichplotfxn(enrichr_Lst_TC32only$TC32only_Under$BioPlanet_2019) + 
  labs(title = "Top BioPlanet Pathways Enriched in Significantly Underexpressed Genes Specific to TC32")

<<<<<<< HEAD

======= >>>>>>> 45bf06fde9fe572ed335590d9a80fc6c0320a88b # Differentially Expressed Gene Data Data for all of the included DE Genes: ### MSC DEG Data:

#MSC
MSC_data <- MSC_res_df %>% 
  replace_na(list(padj=1)) %>%
  drop_na()
MSC_datatable <- datatable(MSC_data)
MSC_datatable

TC32 DEG Data:

#TC32
TC32_data <- TC32_res_df %>% 
  replace_na(list(padj=1)) %>%
  drop_na()
TC32_datatable <- datatable(TC32_data)
TC32_datatable